# Load data and libraries
rm(list = ls())

library(tidyverse)
library(lubridate)
library(ggthemes)
library(readxl)
library(rddensity)
library(pinyin)
library(knitr)
library(RColorBrewer)
library(ggrepel)
library(ggpubr)
library(lfe)
library(GGally)

theme_set(theme_classic())

# Set default repository
setwd(dirname(rstudioapi::getSourceEditorContext()$path))

# Assuming you've already created and transformed the spatial objects
df = read_csv("final_data.csv") 

df$speed_en = factor(
  df$speed_en,  
  levels = c( "On time (<10 days)",
              "Minor Delay (10-13 days)",
              "Intermediate Delay (13-15 days)",
              "Significant Delay (>15 days)")
  )


# Figure 3: Outcome Variable Distribution ####

questionr::describe(df$speed_en)
questionr::describe(df$resolve_en)

vis_duration = df %>% 
  filter(!is.na(speed_en)) %>% 
  group_by(speed_en, year) %>% 
  tally() %>% 
  ggplot(aes(x = year, y = n, fill = speed_en)) +
  geom_bar(position = "stack", stat = "identity", alpha = 0.8, width = 0.6) +
  ylab("Number of Cases") +
  ggtitle("Resolution Time") +
  xlab("") +
  theme_classic(15) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.title = element_blank(),
        legend.position = "bottom") +
  guides(fill = guide_legend(nrow = 2)) +
  scale_fill_brewer(palette = "RdPu", direction = -1)

vis_resolve = df %>% 
  group_by(resolve_en, year) %>% 
  tally() %>% 
  ggplot(aes(x = year, y = n, fill = resolve_en)) +
  geom_bar(position = "stack", stat = "identity", width = 0.6) +
  xlab("") +
  ggtitle("Positive Resolution") +
  ylab("Number of Cases") +
  theme_classic(15) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.title = element_blank(),
        legend.position = "bottom") +
  guides(fill = guide_legend(nrow = 2)) +
  scale_fill_brewer(palette = "RdPu", direction = -1)

ggpubr::ggarrange(vis_duration, vis_resolve, label.y = "Number of Cases")

ggsave("figure/describe_two_dev.png", width = 12.2, height = 4.5)


# Figure 5: Petition Topics ####
tmp_left = read_rds("tmp_left.rds")

p1 <- ggplot(tmp_left, aes(x = reorder(topic, prop), y = prop)) +
  geom_bar(stat = "identity", width = 0.6, fill = "#cdb4db", alpha = 0.8) +
  geom_text(aes(label = reorder(words, prop)), hjust = -0.2, size = 4) +
  ylim(0, 0.3) +
  coord_flip() +
  xlab("") + 
  ylab("Proportion (Poor)") +
  theme_classic(15)

tmp_right = read_rds("tmp_right.rds")

p2 <- ggplot(tmp_right, aes(x = reorder(topic, prop), y = prop)) +
  geom_bar(stat = "identity", width = 0.6, fill = "#cdb4db", alpha = 0.8) +
  geom_text(aes(label = reorder(words, prop)), hjust = -0.2, size = 4) +
  ylim(0, 0.3) +
  coord_flip() +
  xlab("") + 
  ylab("Proportion (Rich)") +
  theme_classic(15)

ggarrange(p1, p2)
ggsave("topic_2.pdf", width = 10, height = 4.5)


# Figure 6: Text Similarity ####
dat_plot <- readRDS("simi_text.rds")

ggplot(data = dat_plot, aes(x = name, y = simu)) +
  geom_bar(stat = "identity", width = 0.6, fill = "#cdb4db", alpha = 0.8) +
  xlab("Case Type") +
  ylab("Similarity") +
  theme_classic(15) +
  coord_flip()

ggsave("simi_text.pdf", width = 5.5, height = 4)


# Figure 7: Correlation between Housing Price and Call Frequency ####

frequency = read_rds("frequency.rds") 
m1 = felm(n_l ~ price_l, frequency)

ggplot(frequency, aes(x = price_l, y = n_l)) + 
  geom_point(alpha = 0.5) +
  stat_smooth(col = "#cdb4db", method = "lm") +
  ylab("Number of Petitions (log+1)") +
  xlab("Price (logged)") +
  labs(title = paste("Slope =", signif(m1$coef[[2]], 5),
                     " P =", signif(summary(m1)$coef[2, 4], 5))) +
  theme_classic(15)

ggsave("freq_price.png", width = 5.5, height = 4)


############# Appendix ###########################################################

# Figure F.1: Provincial-Level Gini Coefficient ####
gini_df = read_rds("gini.rds")

gini_df %>% 
  mutate(cat = ifelse(province == "Shanghai", TRUE, FALSE)) %>% 
  ggplot(aes(reorder(province, gini2012), gini2012, fill = cat)) +
  geom_col(width = 0.9) +
  ylim(0, 1.01) +
  coord_flip(expand = FALSE) +
  labs(x = "", y = "Gini Coefficient") +
  scale_fill_brewer(palette = "RdPu", direction = -1) +
  guides(fill = "none") +
  theme_classic(15)

ggsave("gini_ranking.png", width = 5.5, height = 8)


# Figure F.2: Time Trend of Different Categories ####
df %>%
  filter(!is.na(v_class1_en)) %>%
  ggplot(aes(x = reorder(v_class1_en, v_class1_en, function(x) length(x)), fill = v_class1_en)) +
  geom_bar(alpha = 0.8) +
  coord_flip() +
  scale_fill_brewer(palette = "RdPu") +
  ylab("Number of Cases") +
  xlab("") +
  theme_classic(15) +
  theme(legend.title = element_blank(),
        legend.position = "none")

ggsave("time_trend_cat.png", width = 5.5, height = 4)


# Figure F.3: Community-Level Housing Price Trend ####
community_level = read_excel("fangjia.xlsx") %>% 
  mutate(price_per_sqm = as.numeric(price_per_sqm)) %>% 
  filter(district == "黄浦") %>%
  mutate(year_month = paste(year, month, "1", sep = "-") %>%
           ymd()) %>% 
  mutate(community = py(community, dic = pydic(dic = c("pinyin2")), sep = "") %>% 
           str_remove_all("\\d{1,}")) %>% 
  group_by(year, community) %>% 
  mutate(price_per_sqm = mean(price_per_sqm, na.rm = TRUE))

community_data <- community_level %>% 
  select(community, year, price_per_sqm) %>% 
  distinct()

ggplot(community_data,
       aes(x = year, y = price_per_sqm, 
           group = community, color = community)) +
  geom_line(size = 1.1, alpha = 0.3) +
  geom_point(size = 2, alpha = 0.3) +
  geom_text_repel(
    data = filter(community_data, year == "2016"),
    aes(color = community, label = community),
    max.overlaps = Inf,
    nudge_x = -0.2, 
    direction = "y",
    hjust = "right",
    size = 5
    ) +
  ylab("Housing Price (RMB per square meter)") +
  xlab("Year") +
  theme_classic(12) +
  theme(legend.position = "none")

ggsave("communt_level_price.png", width = 5.5, height = 4)


# Figure F.4: Bivariate Correlation ####
# Define a list of labels for your variables
labels <- c(res_speed = "Resolution Time",
            resolve_0 = "Positive Resolution",
            price_l = "Price",
            female = "Female",
            local = "Local",
            foregin = "Foreign",
            anonymous = "Anonymous",
            distance_km = "Distance to Neighborhood Center")

df %>% 
  select(res_speed, resolve_0, price_l, female, local, foregin, anonymous, distance_km) %>% 
  ggpairs(columnLabels = labels)

ggsave("bivariate_corr.png", width = 12, height = 12)


# Figure F.5: Estimation and Diagnostic of Instrumental Variables ####
library(ivDiag)

m <- ivDiag(
  df, "res_speed", "treatment", "price_l",
  controls = c("female","local","foregin","anonymous","distance_km","national_twosession","city_twosession","local_turnover"),
  FE = c("v_class1", "yearmonth"), cl = "apt_name", 
  weights = NULL, bootstrap = TRUE, run.AR = TRUE, 
  nboots = 1000, parallel = TRUE, cores = NULL, seed = 94305, prec = 4, debug = FALSE
  )

plot_coef(m)


# Figure G.1: Text Similarity ####
# Save the processed similarity plot data if needed
dat1_plot <- readRDS("simi_text1.rds")

ggplot(data = dat1_plot, aes(x = name, y = simu)) +
  geom_bar(stat = "identity", width = 0.6, fill = "#cdb4db", alpha = 0.8) +
  aes(stringr::str_wrap(name, 15)) +
  xlab("Case Type") + 
  ylab("Similarity") +
  theme_classic(12)

ggsave("simi_text1.pdf", width = 8, height = 5)

